Get crime data from ArcGIS API and remove (4) entries with missing geometry; write cleaned data to GeoJSON
# crime_dg <- st_read("https://utility.arcgis.com/usrsvcs/servers/0eaa6be357a74f5280157125e9b547fc/rest/services/OpenData_PublicSafety/FeatureServer/2/query?outFields=*&where=1%3D1&f=geojson")
# crime_dg$Date <- as.Date(fread(".//data//crime_dg.csv")$Date, "%Y/%m/%d")
# crime_dg <- crime_dg[!st_is_empty(crime_dg),]
# st_write(crime_dg, ".//data//crime_dg.geojson")
Read in data
crime_dg <- st_read(".//data//crime_dg.geojson")
## Reading layer `crime_dg' from data source
## `C:\Users\llint\OneDrive - Yale University\classes\625\CT Property\SDS625\data\crime_dg.geojson'
## using driver `GeoJSON'
## Simple feature collection with 709898 features and 12 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -72.71865 ymin: 41.72215 xmax: -72.64829 ymax: 41.80744
## Geodetic CRS: WGS 84
parcel_dg <- st_read(".//data//parcel_hartford.geojson")
## Reading layer `parcel_hartford' from data source
## `C:\Users\llint\OneDrive - Yale University\classes\625\CT Property\SDS625\data\parcel_hartford.geojson'
## using driver `GeoJSON'
## Simple feature collection with 19143 features and 47 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -72.7182 ymin: 41.72368 xmax: -72.64208 ymax: 41.80743
## Geodetic CRS: WGS 84
Census tracts from Tigris for map baselayer
hartford_census_data <- get_acs(geography = "tract", variable = c("B19013_001", "B01001_001"), state = "CT", county = "Hartford", year = 2021) |>
dplyr::select(GEOID, variable, estimate) |>
tidyr::pivot_wider(names_from = variable, values_from = estimate) |>
dplyr::rename(population = "B01001_001", med_income = "B19013_001")
hartford_tracts <- st_filter(tracts(state = "CT"),
subset(county_subdivisions(state = "CT"), NAMELSAD == "Hartford town"),
.predicate = st_within)
hartford_tracts <- merge(hartford_tracts, hartford_census_data, by = "GEOID")
water <- st_intersection(area_water("CT", "Hartford"), hartford_tracts)
## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries
crime_dg <- crime_dg |> subset(year(Date) %in% 2016:2021) |> st_transform(st_crs(hartford_tracts))
parcel_dg <- parcel_dg |> subset(State_Use_Description == "ONE FAMILY") |> st_transform(st_crs(hartford_tracts))
plot
ggplot(crime_dg) +
geom_sf(data = hartford_tracts) +
geom_sf(data = water, color = "blue", linewidth = 0.35) +
geom_hex(aes(X, Y, fill = log10(..count..)), data = ~cbind(.x, st_coordinates(.x)), alpha = 0.75, bins = 50) +
scale_fill_binned() +
scale_x_continuous(guide = guide_axis(angle = 45)) +
labs(x = "Latitude", y = "Longitude") +
theme_bw()
g <- ggplot(dplyr::sample_n(crime_dg, 1e3)) +
geom_sf(data = hartford_tracts, aes(label1 = GEOID,
label2 = med_income,
label3 = population)) +
geom_sf(data = dplyr::sample_n(parcel_dg, 1e3)) +
geom_density_2d(aes(X,Y), data = ~cbind(.x, st_coordinates(.x))) +
stat_sf_coordinates(size = 0.1, color = "red") +
labs(x = "Latitude", y = "Longitude") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45))
p <- toWebGL(ggplotly(g))
p$x$data[[4]]$hoverinfo <- "none"
p
The filters applied to the data are:
The manually curated variables are:
Price: Assessed total value of the parcelThefts: Number of thefts (robberies, burglaries,
larceny, theft, or stolen property) within 150 meters of the parcelViolence: Number of violent crimes (assaults,
homicides, shootings) within 150 meters of the parcelLiving_Area: Living area of the parcelEffective_Area: Effective area of the parcelYear: Approximate year the parcel was builtBed: Number of bedrooms in the parcelBath: Number of bathrooms in the parcelThe highly correlated numeric covariates are
There appear to be more thefts and violent crimes near single family
homes in dilapidated condition. Homes in worse condition
tend to be older, but there are also older homes in good condition.
There is not a clear relationship between the year the parcel was built and its condition.
All covariates appear to correlate with the response variable,
Assessed_Total.
parcel_dg %<>% subset(select = c("Assessed_Total", "Living_Area", "Effective_Area", "AYB",
"Number_of_Bedroom", "Number_of_Baths", "Condition_Description"))
parcel_dg %<>% dplyr::rename(Price = "Assessed_Total", Year = "AYB", Bed = "Number_of_Bedroom",
Bath = "Number_of_Baths", Condition = "Condition_Description")
parcel_dg$Condition %<>% factor(levels = c("Dilapidated", "Very Poor", "Poor", "Fair", "Fair-Avg", "Average",
"Avg-Good", "Good", "Good-VG", "Very Good", "Excellent"))
paste0("Dropping n=", nrow(parcel_dg) - nrow(na.omit(parcel_dg)), " rows with NAs.")
## [1] "Dropping n=19 rows with NAs."
X <- na.omit(parcel_dg)
stealing <- crime_dg[grep("ROBBERY|BURGLARY|LARCENY|THEFT", crime_dg$UCR_1_Category), ]
violence <- crime_dg[grep("ASSAULT|HOMICIDE|SHOOTING", crime_dg$UCR_1_Category), ]
sf_use_s2(FALSE)
X$Thefts <- st_is_within_distance(st_transform(X, crs = 26956),
st_transform(stealing, crs = 26956),
dist = units::set_units(150, "m")) |> sapply(length)
X$Violence <- st_is_within_distance(st_transform(X, crs = 26956),
st_transform(violence, crs = 26956),
dist = units::set_units(150, "m")) |> sapply(length)
X$GEOID <- hartford_tracts$GEOID[st_intersects(X, hartford_tracts) |> sapply(head, 1)]
setDT(copy(X))[as.data.table(hartford_tracts),
.(violence_rate = sum(Violence)/i.population,
avg_property_value = mean(Price)),
on = "GEOID", by = .EACHI]
parcel_proj <- X |> st_transform(crs = 26956) |> st_centroid() |> st_coordinates()
## Warning: st_centroid assumes attributes are constant over geometries
stealing_proj <- stealing |> st_transform(crs = 26956) |> st_coordinates()
X$Thefts_kde <- kde(stealing_proj, Hscv(stealing_proj)) |> predict(x = parcel_proj)
X_plot_vars <- st_drop_geometry(X)[c("Thefts", "Violence", "Living_Area", "Effective_Area",
"Year", "Bed", "Bath", "Condition", "Price")]
# Plot pairwise correlation between numeric predictors
X_plot_vars |>
GGally::ggpairs(lower = list(continuous = "points")) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Plot boxplots of numeric predictors with respect to condition description
# Pivot data longer so that we can facet by variable
X_plot_vars |>
tidyr::pivot_longer(!Condition) |>
ggplot(aes(x = name, y = value, fill = Condition)) +
geom_boxplot(position = position_dodge(width = 1)) +
facet_wrap(~name, scales = "free") +
labs(x = NULL, y = NULL) +
theme_bw()
##
Analysis
Variable Selection. We leave out Thefts since
it is highly correlated with Violence but has a lower
correlation with Price than Violence does. For
similar reasons, we leave out Effective_Area and keep
Living_Area. We keep both Bed and
Bath for now.
Transformations. From the correlation plots, we can see that
Price, Living_Area, and Violence
are right-skewed. To adjust for this, we take the log of
Price and the square root of Living_Area and
Violence.
# Fit linear models with no transformations
m0 <- lm(Price ~ Violence + Living_Area + Year + Bed + Bath + Condition,
data = X)
summary(m0)
##
## Call:
## lm(formula = Price ~ Violence + Living_Area + Year + Bed + Bath +
## Condition, data = X)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75062 -7023 856 6819 110854
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.458e+05 1.440e+04 -17.063 < 2e-16 ***
## Violence -2.521e+02 6.990e+00 -36.065 < 2e-16 ***
## Living_Area 3.349e+01 3.255e-01 102.893 < 2e-16 ***
## Year 1.132e+02 6.656e+00 17.002 < 2e-16 ***
## Bed -2.306e+03 2.293e+02 -10.061 < 2e-16 ***
## Bath 5.426e+03 3.556e+02 15.258 < 2e-16 ***
## ConditionVery Poor 6.029e+02 8.696e+03 0.069 0.9447
## ConditionPoor 1.345e+04 6.813e+03 1.974 0.0484 *
## ConditionFair 2.685e+04 6.408e+03 4.190 2.82e-05 ***
## ConditionFair-Avg 3.280e+04 6.258e+03 5.241 1.65e-07 ***
## ConditionAverage 4.180e+04 6.163e+03 6.784 1.27e-11 ***
## ConditionAvg-Good 4.936e+04 6.158e+03 8.017 1.26e-15 ***
## ConditionGood 5.025e+04 6.157e+03 8.161 3.90e-16 ***
## ConditionGood-VG 5.442e+04 6.176e+03 8.810 < 2e-16 ***
## ConditionVery Good 5.704e+04 6.179e+03 9.231 < 2e-16 ***
## ConditionExcellent 6.455e+04 6.330e+03 10.197 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13740 on 7204 degrees of freedom
## Multiple R-squared: 0.8252, Adjusted R-squared: 0.8248
## F-statistic: 2267 on 15 and 7204 DF, p-value: < 2.2e-16
# Plot diagnostics
par(mfrow = c(2, 2))
plot(m0)
# Fit linear models with transformations
m1 <- lm(log(Price) ~ sqrt(Violence) + sqrt(Living_Area) + Year +
Bed + Bath + Condition,
data = X)
summary(m1)
##
## Call:
## lm(formula = log(Price) ~ sqrt(Violence) + sqrt(Living_Area) +
## Year + Bed + Bath + Condition, data = X)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.27511 -0.07695 0.01528 0.09805 0.72980
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.884e+00 1.771e-01 38.860 < 2e-16 ***
## sqrt(Violence) -4.028e-02 9.175e-04 -43.908 < 2e-16 ***
## sqrt(Living_Area) 2.823e-02 3.803e-04 74.234 < 2e-16 ***
## Year 9.114e-04 8.078e-05 11.282 < 2e-16 ***
## Bed -9.644e-03 2.818e-03 -3.422 0.000626 ***
## Bath 4.061e-02 4.161e-03 9.759 < 2e-16 ***
## ConditionVery Poor 7.605e-01 1.045e-01 7.279 3.71e-13 ***
## ConditionPoor 9.360e-01 8.185e-02 11.436 < 2e-16 ***
## ConditionFair 1.068e+00 7.699e-02 13.875 < 2e-16 ***
## ConditionFair-Avg 1.131e+00 7.519e-02 15.043 < 2e-16 ***
## ConditionAverage 1.377e+00 7.404e-02 18.603 < 2e-16 ***
## ConditionAvg-Good 1.500e+00 7.398e-02 20.271 < 2e-16 ***
## ConditionGood 1.517e+00 7.398e-02 20.507 < 2e-16 ***
## ConditionGood-VG 1.552e+00 7.421e-02 20.911 < 2e-16 ***
## ConditionVery Good 1.580e+00 7.424e-02 21.277 < 2e-16 ***
## ConditionExcellent 1.650e+00 7.605e-02 21.702 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1651 on 7204 degrees of freedom
## Multiple R-squared: 0.7622, Adjusted R-squared: 0.7617
## F-statistic: 1540 on 15 and 7204 DF, p-value: < 2.2e-16
plot(m1)
There are many parcels beyond 4 standard errors below the regression line. This means the model is severely overestimating the value of these parcels. Let’s find out what these parcels are.
Average condition, although the majority of them are
Average or worse.Solution. Refer to the previous correlation plots and
observe that Price and Living Area appear to have a strongly correlated
linear correlation. However, we performed a log
transformation on the Price variable but a square-root transformation on
the Living Area variable. This asymmetry may be the cause of the large
residuals.
# Investigate the large negative residuals
r <- residuals(m1) / summary(m1)$sigma # standard residuals
o <- cbind(X, "StdResid" = r)[r < -4, ]
nrow(o)
## [1] 31
# Plot the offending parcels geographically
g <- ggplot(o) +
geom_sf(data = hartford_tracts) +
stat_sf_coordinates(size = 1, color = "red") +
labs(x = "Latitude", y = "Longitude") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
toWebGL(ggplotly(g))
# Plot residuals vs. Condition
# Function for number of observations
give.n <- function(x){
return(data.frame(
y = quantile(x, .75) + 0.1,
label = paste0("n = ", length(x))
))
}
ggplot(data = o, aes(x = Condition, y = StdResid)) +
geom_boxplot() +
stat_summary(fun.data = give.n, geom = "text", fun.y = median) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme_bw()
# Plot residuals vs transformed numeric covariates
o <- o |>
st_drop_geometry() |>
mutate(logPrice = log(Price),
sqrtViolence = sqrt(Violence),
sqrtLiving_Area = sqrt(Living_Area))
GGally::ggpairs(o, columns = c("StdResid", "logPrice", "sqrtViolence", "sqrtLiving_Area", "Bed", "Bath", "Thefts"), lower = list(continuous = "points"),
progress = F) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Let’s test our hypothesis.
Living_Area, the residuals are
more symmetrically distributed. We may have sacrificed some upper tail
normality for the lower tail, since there appear to be more residuals
larger than 4 now. However, we have reduced the total number of
residuals beyond 4 SE’s from 33 to 22.# Take log of living area
m2 <- lm(log(Price) ~ sqrt(Violence) + log(Living_Area) + Year +
Bed + Bath + Condition,
data = X)
summary(m2)
##
## Call:
## lm(formula = log(Price) ~ sqrt(Violence) + log(Living_Area) +
## Year + Bed + Bath + Condition, data = X)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.28251 -0.08783 0.01367 0.10248 1.09006
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.174e+00 2.016e-01 20.709 < 2e-16 ***
## sqrt(Violence) -4.364e-02 9.532e-04 -45.777 < 2e-16 ***
## log(Living_Area) 5.548e-01 8.271e-03 67.070 < 2e-16 ***
## Year 7.687e-04 8.401e-05 9.150 < 2e-16 ***
## Bed -8.052e-04 2.920e-03 -0.276 0.783
## Bath 7.505e-02 4.174e-03 17.982 < 2e-16 ***
## ConditionVery Poor 7.686e-01 1.089e-01 7.059 1.84e-12 ***
## ConditionPoor 9.315e-01 8.531e-02 10.919 < 2e-16 ***
## ConditionFair 1.068e+00 8.025e-02 13.305 < 2e-16 ***
## ConditionFair-Avg 1.125e+00 7.838e-02 14.354 < 2e-16 ***
## ConditionAverage 1.365e+00 7.718e-02 17.691 < 2e-16 ***
## ConditionAvg-Good 1.485e+00 7.711e-02 19.252 < 2e-16 ***
## ConditionGood 1.500e+00 7.711e-02 19.450 < 2e-16 ***
## ConditionGood-VG 1.536e+00 7.735e-02 19.863 < 2e-16 ***
## ConditionVery Good 1.567e+00 7.738e-02 20.250 < 2e-16 ***
## ConditionExcellent 1.642e+00 7.927e-02 20.718 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1721 on 7204 degrees of freedom
## Multiple R-squared: 0.7417, Adjusted R-squared: 0.7411
## F-statistic: 1379 on 15 and 7204 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(m2)
# Drop number of bedrooms
m3 <- lm(log(Price) ~ sqrt(Violence) + log(Living_Area) + Year +
Bath + Condition,
data = X)
summary(m3)
##
## Call:
## lm(formula = log(Price) ~ sqrt(Violence) + log(Living_Area) +
## Year + Bath + Condition, data = X)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.28246 -0.08786 0.01381 0.10273 1.08541
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.180e+00 2.005e-01 20.844 < 2e-16 ***
## sqrt(Violence) -4.366e-02 9.488e-04 -46.017 < 2e-16 ***
## log(Living_Area) 5.535e-01 6.998e-03 79.099 < 2e-16 ***
## Year 7.694e-04 8.396e-05 9.164 < 2e-16 ***
## Bath 7.485e-02 4.105e-03 18.231 < 2e-16 ***
## ConditionVery Poor 7.685e-01 1.089e-01 7.058 1.84e-12 ***
## ConditionPoor 9.311e-01 8.530e-02 10.916 < 2e-16 ***
## ConditionFair 1.067e+00 8.023e-02 13.303 < 2e-16 ***
## ConditionFair-Avg 1.125e+00 7.836e-02 14.352 < 2e-16 ***
## ConditionAverage 1.365e+00 7.716e-02 17.690 < 2e-16 ***
## ConditionAvg-Good 1.484e+00 7.710e-02 19.252 < 2e-16 ***
## ConditionGood 1.499e+00 7.710e-02 19.449 < 2e-16 ***
## ConditionGood-VG 1.536e+00 7.733e-02 19.863 < 2e-16 ***
## ConditionVery Good 1.567e+00 7.737e-02 20.249 < 2e-16 ***
## ConditionExcellent 1.642e+00 7.926e-02 20.718 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.172 on 7205 degrees of freedom
## Multiple R-squared: 0.7416, Adjusted R-squared: 0.7411
## F-statistic: 1477 on 14 and 7205 DF, p-value: < 2.2e-16
plot(m3)
# Compare residuals beyond 4 SE's
r1 <- residuals(m1) / summary(m1)$sigma
sum(r1 < -4 | r1 > 4)
## [1] 33
r3 <- residuals(m3) / summary(m3)$sigma
sum(r3 < -4 | r3 > 4)
## [1] 22
Repeating the same residuals analysis from before, we see that there are no longer any strongly correlated covariates with the residuals.
# Investigate the large negative residuals
r <- residuals(m3) / summary(m3)$sigma # standard residuals
o <- cbind(X, "StdResid" = r)[r < -4, ]
nrow(o)
## [1] 18
# Plot the offending parcels geographically
g <- ggplot(o) +
geom_sf(data = hartford_tracts) +
stat_sf_coordinates(size = 1, color = "red") +
labs(x = "Latitude", y = "Longitude") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
toWebGL(ggplotly(g))
# Plot residuals vs. Condition
# Function for number of observations
give.n <- function(x){
return(data.frame(
y = quantile(x, .75) + 0.1,
label = paste0("n = ", length(x))
))
}
ggplot(data = o, aes(x = Condition, y = StdResid)) +
geom_boxplot() +
stat_summary(fun.data = give.n, geom = "text", fun.y = median) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme_bw()
# Plot residuals vs transformed numeric covariates
o <- o |>
st_drop_geometry() |>
mutate(logPrice = log(Price),
sqrtViolence = sqrt(Violence),
logLiving_Area = log(Living_Area))
GGally::ggpairs(o, columns = c("StdResid", "logPrice", "sqrtViolence", "logLiving_Area", "Bath", "Thefts"), lower = list(continuous = "points"),
progress = F) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))